In [1]:
import numpy as np
from scipy import misc
import scipy.io as scio
import matplotlib.pyplot as plt
from sklearn import metrics
import h5py
from sklearn.cluster import KMeans, MiniBatchKMeans, Birch, DBSCAN
In [2]:
def averageDistToCenter(X,C):
    avdist = np.sum(np.sqrt(np.sum((X - C)**2,axis=0)),axis=0)
    avdist /= np.maximum(np.size(X,axis=1),1)
    return avdist


def dunnIndex(X, C, labels):
    centersDistance = np.sum((C.T[np.newaxis,:,:] - C.T[:,np.newaxis,:])**2., axis=-1).T
    np.fill_diagonal(centersDistance, np.max(centersDistance))
    centersMinDistance = np.sqrt(np.min(centersDistance))
    uniqueLabels = C.shape[1]
    m = 0.0
    for l in range(uniqueLabels):
        d = averageDistToCenter(X[:,labels==l],C[:,l, np.newaxis])
        m = np.max([m,d])
        
    return centersMinDistance/m

def daviesBouldinIndex(X, C, labels):
    centersDistance = np.sqrt(np.sum((C.T[np.newaxis,:,:] - C.T[:,np.newaxis,:])**2., axis=-1).T)
    uniqueLabels = C.shape[1]
    distsToCenters = np.empty(uniqueLabels)
    for l in range(uniqueLabels):
        distsToCenters[l] = averageDistToCenter(X[:,labels==l],C[:,l, np.newaxis])
    clusterSchemeMeasure = np.empty(uniqueLabels)
    for i in range(uniqueLabels):
        clusterSchemeMeasure[i] = np.max((distsToCenters[i] + np.delete(distsToCenters,i))/np.delete(centersDistance[:,i], i), axis=-1)
        
    return np.sum(clusterSchemeMeasure)/uniqueLabels
In [3]:
with h5py.File('data.mat', 'r') as file:
    image = np.array(list(file['ImageRaw']))
# dict = scio.loadmat('data')
In [4]:
image.shape
data = np.reshape(image.T,(1000*1500,285))
In [4]:
data2 = np.reshape(image,(285,1000*1500)).T
In [6]:
data2.shape
Out[6]:
(1500000, 285)
In [5]:
k_means = KMeans(init='k-means++', n_clusters=5, n_init=1)
In [ ]:
k_means.fit(data)
In [6]:
k_means.fit(data2)
Out[6]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)
In [7]:
print('Dunn index: ', dunnIndex(data2.T, k_means.cluster_centers_.T, k_means.labels_))
print('Davies-Bouldin index: ', daviesBouldinIndex(data2.T, k_means.cluster_centers_.T, k_means.labels_))
Dunn index:  0.985661493124
Davies-Bouldin index:  0.92022991781
In [8]:
areaMap = np.array([[i,j] for i in range(1000) for j in range(1500)])
areaMap2 = np.array([[j,i] for i in range(1000) for j in range(1500)])
In [93]:
areaMap
Out[93]:
array([[   0,    0],
       [   1,    0],
       [   2,    0],
       ..., 
       [1497,  999],
       [1498,  999],
       [1499,  999]])
In [10]:
for i in range(5):
    plt.scatter(areaMap[k_means.labels_ == i,0], areaMap[k_means.labels_ == i,1], marker='.')
    plt.show()    
In [11]:
for i in range(5):
    plt.scatter(areaMap2[k_means.labels_ == i,0], areaMap2[k_means.labels_ == i,1], marker='.')
    plt.show()    
In [9]:
k_means10 = KMeans(init='k-means++', n_clusters=10, n_init=1)
In [10]:
k_means10.fit(data2)
Out[10]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)
In [11]:
print('Dunn index: ', dunnIndex(data2.T, k_means10.cluster_centers_.T, k_means10.labels_))
print('Davies-Bouldin index: ', daviesBouldinIndex(data2.T, k_means10.cluster_centers_.T, k_means10.labels_))
Dunn index:  0.608479925287
Davies-Bouldin index:  0.984311074794
In [99]:
np.unique(k_means10.labels_)
Out[99]:
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)
In [16]:
for i in range(10):
    plt.plot(areaMap[k_means10.labels_ == i,0], areaMap[k_means10.labels_ == i,1], '.', markersize=1)
    plt.show()
In [17]:
for i in range(10):
    plt.plot(areaMap2[k_means10.labels_ == i,0], areaMap2[k_means10.labels_ == i,1], '.', markersize=1)
    plt.show()
In [12]:
k_means20 = KMeans(init='k-means++', n_clusters=20, n_init=1)
In [13]:
k_means20.fit(data2)
Out[13]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=20, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)
In [14]:
print('Dunn index: ', dunnIndex(data2.T, k_means20.cluster_centers_.T, k_means20.labels_))
print('Davies-Bouldin index: ', daviesBouldinIndex(data2.T, k_means20.cluster_centers_.T, k_means20.labels_))
Dunn index:  0.351281871712
Davies-Bouldin index:  1.04796181675
In [20]:
for i in range(20):
    plt.plot(areaMap[k_means20.labels_ == i,0], areaMap[k_means20.labels_ == i,1], '.',markersize=1)
    plt.show()
In [21]:
for i in range(20):
    plt.plot(areaMap2[k_means20.labels_ == i,0], areaMap2[k_means20.labels_ == i,1], '.',markersize=1)
    plt.show()
In [15]:
k_means30 = KMeans(init='k-means++', n_clusters=30, n_init=1)
In [16]:
k_means30.fit(data2)
Out[16]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=30, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)
In [17]:
print('Dunn index: ', dunnIndex(data2.T, k_means30.cluster_centers_.T, k_means30.labels_))
print('Davies-Bouldin index: ', daviesBouldinIndex(data2.T, k_means30.cluster_centers_.T, k_means30.labels_))
Dunn index:  0.206947417164
Davies-Bouldin index:  1.0678814906
In [24]:
for i in range(30):
    plt.plot(areaMap[k_means30.labels_ == i,0], areaMap[k_means30.labels_ == i,1], '.',markersize=1)
    plt.show()
In [41]:
plt.figure(figsize=(12,15))
plt.plot(areaMap[k_means30.labels_ == 7,0], areaMap[k_means30.labels_ == 7,1], 'g.',markersize=1)
plt.plot(areaMap[k_means30.labels_ == 24,0], areaMap[k_means30.labels_ == 24,1], 'g.',markersize=1)
plt.plot(areaMap[k_means30.labels_ == 10,0], areaMap[k_means30.labels_ == 10,1], 'b.',markersize=1)
plt.plot(areaMap[k_means30.labels_ == 5,0], areaMap[k_means30.labels_ == 5,1], 'y.',markersize=1)
plt.plot(areaMap[k_means30.labels_ == 14,0], areaMap[k_means30.labels_ == 14,1], 'k.',markersize=1)
plt.plot(areaMap[k_means30.labels_ == 26,0], areaMap[k_means30.labels_ == 26,1], 'k.',markersize=1)
plt.show()
In [18]:
mini_batch_k_means50 = MiniBatchKMeans(init='k-means++', n_clusters=50, batch_size=50, n_init=1, max_no_improvement=10, verbose=0, random_state=0)
mini_batch_k_means50.fit(data2)
Out[18]:
MiniBatchKMeans(batch_size=50, compute_labels=True, init='k-means++',
        init_size=None, max_iter=100, max_no_improvement=10, n_clusters=50,
        n_init=1, random_state=0, reassignment_ratio=0.01, tol=0.0,
        verbose=0)
In [19]:
print('Dunn index: ', dunnIndex(data2.T, mini_batch_k_means50.cluster_centers_.T, mini_batch_k_means50.labels_))
print('Davies-Bouldin index: ', daviesBouldinIndex(data2.T, mini_batch_k_means50.cluster_centers_.T, mini_batch_k_means50.labels_))
Dunn index:  0.189611580111
Davies-Bouldin index:  1.19816631723
In [50]:
for i in range(50):
    print(i)
    plt.plot(areaMap[mini_batch_k_means50.labels_ == i,0], areaMap[mini_batch_k_means50.labels_ == i,1], '.',markersize=1)
    plt.show()
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
In [59]:
plt.figure(figsize=(12,15))
plt.plot(areaMap[mini_batch_k_means50.labels_ == 7,0], areaMap[mini_batch_k_means50.labels_ == 7,1], 'b.',markersize=1)
plt.plot(areaMap[mini_batch_k_means50.labels_ == 27,0], areaMap[mini_batch_k_means50.labels_ == 27,1], 'y.',markersize=1)
plt.plot(areaMap[mini_batch_k_means50.labels_ == 35,0], areaMap[mini_batch_k_means50.labels_ == 35,1], 'y.',markersize=1)
plt.plot(areaMap[mini_batch_k_means50.labels_ == 43,0], areaMap[mini_batch_k_means50.labels_ == 43,1], 'k.',markersize=1)
plt.plot(areaMap[mini_batch_k_means50.labels_ == 41,0], areaMap[mini_batch_k_means50.labels_ == 41,1], 'k.',markersize=1)
plt.plot(areaMap[mini_batch_k_means50.labels_ == 48,0], areaMap[mini_batch_k_means50.labels_ == 48,1], 'g.',markersize=1)
plt.plot(areaMap[mini_batch_k_means50.labels_ == 39,0], areaMap[mini_batch_k_means50.labels_ == 39,1], 'g.',markersize=1)
plt.plot(areaMap[mini_batch_k_means50.labels_ == 38,0], areaMap[mini_batch_k_means50.labels_ == 38,1], 'g.',markersize=1)
plt.plot(areaMap[mini_batch_k_means50.labels_ == 31,0], areaMap[mini_batch_k_means50.labels_ == 31,1], 'g.',markersize=1)
plt.plot(areaMap[mini_batch_k_means50.labels_ == 8,0], areaMap[mini_batch_k_means50.labels_ == 8,1], 'g.',markersize=1)
plt.plot(areaMap[mini_batch_k_means50.labels_ == 28,0], areaMap[mini_batch_k_means50.labels_ == 28,1], 'k.',markersize=1)
plt.show()
In [20]:
dbscan = DBSCAN(eps=500, min_samples=10)
dbscan.fit(data2)
Out[20]:
DBSCAN(algorithm='auto', eps=500, leaf_size=30, metric='euclidean',
    metric_params=None, min_samples=10, n_jobs=1, p=None)
In [36]:
np.unique(dbscan.labels_)
Out[36]:
array([  -1,    0,    1, ..., 1409, 1410, 1411])
In [24]:
plt.figure(figsize=(12,15))
plt.plot(areaMap[dbscan.labels_ == 0,0], areaMap[dbscan.labels_ == 0,1], 'b.',markersize=1)
plt.plot(areaMap[dbscan.labels_ == 2,0], areaMap[dbscan.labels_ == 2,1], 'k.',markersize=1)
plt.plot(areaMap[dbscan.labels_ == 1,0], areaMap[dbscan.labels_ == 1,1], 'g.',markersize=1)
plt.plot(areaMap[dbscan.labels_ == 29,0], areaMap[dbscan.labels_ == 29,1], 'g.',markersize=1)
plt.show()
In [27]:
centers = np.empty((data2.shape[1],3))
for i in [0,1,2]:
    l = data2.T[:,dbscan.labels_ == i]
    centers[:,i] = np.sum(l, axis=1)/l.shape[1]
print('Dunn index: ', dunnIndex(data2.T, centers, dbscan.labels_))
print('Davies-Bouldin index: ', daviesBouldinIndex(data2.T, centers, dbscan.labels_))
Dunn index:  3.28500148499
Davies-Bouldin index:  0.516584648011